We are asked to analyze movement and tracking data of GAStech employees to identify anomalies and suspicious behaviour.
GAStech is a company located on the island country of Kronos, and it has come to the authorities' attention that some of its employees have mysteriously gone missing. Vehicle tracking data from devices secretly installed in the company's cars, together with Kronos-Kares benefit card information, has been delivered to the authorities for investigation.
# Packages required by this analysis. Any package not yet installed is
# fetched from CRAN before being attached.
packages <- c('tidyverse', 'lubridate', 'dplyr', 'raster', 'clock', 'sf', 'tmap',
              'plotly', 'ggplot2', 'mapview', 'rgdal', 'rgeos', 'tidyr', 'timevis')
for (p in packages) {
  # Use TRUE (not the reassignable shorthand T); require() returns FALSE
  # when the package is missing, triggering an install.
  if (!require(p, character.only = TRUE)) {
    install.packages(p, repos = "http://cran.us.r-project.org")
  }
  library(p, character.only = TRUE)
}
# Employee-to-vehicle mapping (one CarID per employee).
# NOTE(review): every other file in this script is read from "MC2/" (upper
# case); the original used "mc2/", which fails on case-sensitive file
# systems. Standardised to "MC2/" — confirm the on-disk folder name.
carAssignment <- read_csv("MC2/car-assignments.csv")
carAssignment
# A tibble: 44 x 5
LastName FirstName CarID CurrentEmploymentT… CurrentEmploymentT…
<chr> <chr> <dbl> <chr> <chr>
1 Calixto Nils 1 Information Techno… IT Helpdesk
2 Azada Lars 2 Engineering Engineer
3 Balas Felix 3 Engineering Engineer
4 Barranco Ingrid 4 Executive SVP/CFO
5 Baza Isak 5 Information Techno… IT Technician
6 Bergen Linnea 6 Information Techno… IT Group Manager
7 Orilla Elsa 7 Engineering Drill Technician
8 Alcazar Lucas 8 Information Techno… IT Technician
9 Cazar Gustav 9 Engineering Drill Technician
10 Campo-Corr… Ada 10 Executive SVP/CIO
# … with 34 more rows
# Credit card transactions. The raw timestamp is "m/d/Y H:M" text; parse
# it once, then derive the date, hh:mm time, and hour-of-day helper
# columns used by the charts below. Using a single mutate() pipeline
# instead of repeated `$` assignments.
# NOTE(review): zone = "" parses in the session's local time zone —
# confirm this matches the intended Kronos time zone.
ccData <- read_csv("MC2/cc_data.csv") %>%
  mutate(
    timestamp = date_time_parse(timestamp, zone = "", format = "%m/%d/%Y %H:%M"),
    date = as.Date(timestamp),
    time = strftime(timestamp, "%H:%M"),
    hr = strftime(timestamp, "%H")
  )
# Loyalty card transactions; timestamps here carry only a calendar date,
# so mdy() is enough before converting to Date.
loyaltyData <- read_csv("MC2/loyalty_data.csv")
loyaltyData <- mutate(loyaltyData, date = as.Date(mdy(timestamp)))
# Pair credit-card transactions with loyalty-card records that share the
# same date, location and price; a left join keeps every credit-card
# transaction even when no loyalty record matches (loyaltynum becomes NA).
# NOTE(review): several transactions can share date/location/price, so
# this join can fan out duplicate rows — a distinct() is applied further
# down the script. dplyr::select is spelled out because the raster
# package (attached after dplyr) masks select().
ccLoyalty <- left_join(ccData, loyaltyData, by = c("date", "location", "price")) %>%
dplyr::select(timestamp.x, date, time, location, price, last4ccnum, loyaltynum, hr) %>%
rename(timestamp = timestamp.x) %>%
group_by(last4ccnum)
# Day-of-week label and character card number, assigned in place on the
# grouped tibble (grouping by last4ccnum is retained for later summaries).
ccLoyalty$weekday = wday(ccLoyalty$date, label = TRUE, abbr = TRUE)
ccLoyalty$last4ccnum = as.character(ccLoyalty$last4ccnum)
ccLoyalty
# A tibble: 1,496 x 9
# Groups: last4ccnum [55]
timestamp date time location price last4ccnum
<dttm> <date> <chr> <chr> <dbl> <chr>
1 2014-01-06 07:28:00 2014-01-05 07:28 Brew've Been… 11.3 4795
2 2014-01-06 07:34:00 2014-01-05 07:34 Hallowed Gro… 52.2 7108
3 2014-01-06 07:35:00 2014-01-05 07:35 Brew've Been… 8.33 6816
4 2014-01-06 07:36:00 2014-01-05 07:36 Hallowed Gro… 16.7 9617
5 2014-01-06 07:37:00 2014-01-05 07:37 Brew've Been… 4.24 7384
6 2014-01-06 07:38:00 2014-01-05 07:38 Brew've Been… 4.17 5368
7 2014-01-06 07:42:00 2014-01-05 07:42 Coffee Camel… 28.7 7253
8 2014-01-06 07:43:00 2014-01-05 07:43 Brew've Been… 9.6 4948
9 2014-01-06 07:43:00 2014-01-05 07:43 Brew've Been… 16.9 9683
10 2014-01-06 07:47:00 2014-01-05 07:47 Hallowed Gro… 16.5 8129
# … with 1,486 more rows, and 3 more variables: loyaltynum <chr>,
# hr <chr>, weekday <ord>
# A tibble: 1,496 x 9
# Groups: last4ccnum [55]
timestamp date time location price last4ccnum
<dttm> <date> <chr> <chr> <dbl> <chr>
1 2014-01-06 07:28:00 2014-01-05 07:28 Brew've Been… 11.3 4795
2 2014-01-06 07:34:00 2014-01-05 07:34 Hallowed Gro… 52.2 7108
3 2014-01-06 07:35:00 2014-01-05 07:35 Brew've Been… 8.33 6816
4 2014-01-06 07:36:00 2014-01-05 07:36 Hallowed Gro… 16.7 9617
5 2014-01-06 07:37:00 2014-01-05 07:37 Brew've Been… 4.24 7384
6 2014-01-06 07:38:00 2014-01-05 07:38 Brew've Been… 4.17 5368
7 2014-01-06 07:42:00 2014-01-05 07:42 Coffee Camel… 28.7 7253
8 2014-01-06 07:43:00 2014-01-05 07:43 Brew've Been… 9.6 4948
9 2014-01-06 07:43:00 2014-01-05 07:43 Brew've Been… 16.9 9683
10 2014-01-06 07:47:00 2014-01-05 07:47 Hallowed Gro… 16.5 8129
# … with 1,486 more rows, and 3 more variables: loyaltynum <chr>,
# hr <chr>, weekday <ord>
# Clean mis-encoded characters in location names (a known data-quality
# issue in the cc/loyalty files):
#  - strip stray right-single-quote bytes (CP-1252 \x92 and the UTF-8
#    sequence \xE2\x80\x99) so variants of the same venue name collapse
#    to one spelling;
#  - replace the residue bytes of a mangled accented "e" with plain "e".
# NOTE(review): these byte patterns were presumably derived from the raw
# files — confirm they do not clobber legitimate characters elsewhere.
ccLoyalty$location <- gsub("[\x92\xE2\x80\x99]", "", ccLoyalty$location)
ccLoyalty$location <- gsub("[\xfc\xbe\x8e\x96\x94\xbc]", "e", ccLoyalty$location)
# One row per (credit card, loyalty card) pairing actually observed;
# cards with no matched loyalty number are dropped.
ccLoyalty_person <- ccLoyalty %>%
  group_by(last4ccnum) %>%
  distinct(loyaltynum) %>%
  arrange(last4ccnum) %>%
  # The original `loyaltynum != 'NA'` removed genuine NAs only as a side
  # effect of NA comparison propagating. Test missingness explicitly, and
  # still drop literal "NA" strings, preserving the original behaviour.
  filter(!is.na(loyaltynum), loyaltynum != 'NA')
# Provisional person id: one id per distinct credit card.
# NOTE: group_indices() is deprecated in recent dplyr (cur_group_id() /
# consecutive ids are preferred); kept here for compatibility with the
# rest of this script.
ccLoyalty_person$ccPerson <- ccLoyalty_person %>% group_indices(last4ccnum)
# One row per (loyalty card, credit card) pairing, seen from the loyalty
# side. distinct() on a grouped tibble keeps the grouping column, so the
# result holds every observed loyaltynum/last4ccnum combination, grouped
# by loyaltynum.
lcard <- ccLoyalty %>%
group_by(loyaltynum) %>%
distinct(last4ccnum) %>%
arrange(loyaltynum)
# Link the two pairing tables on loyaltynum, so credit cards tied to the
# same loyalty card share a ccPerson id. base::merge() behaves as an
# inner join here and drops the dplyr grouping attributes.
new <- merge(ccLoyalty_person, lcard, by="loyaltynum") %>%
arrange(ccPerson)
# Lookup of ccPerson id per credit card (last4ccnum.y originates from the
# lcard side of the merge). dplyr::select is spelled out: the raster
# package is attached after dplyr and masks select(), so a bare select()
# call here would dispatch to raster::select and fail on a data frame —
# the same qualification already used earlier in this script.
lookup <- new %>%
  dplyr::select(ccPerson, last4ccnum.y) %>%
  arrange(ccPerson) %>%
  distinct(last4ccnum.y, .keep_all = TRUE)
# Keep only the pairings whose (ccPerson, last4ccnum.y) combination
# appears in the lookup table, then order by person id.
xdata <- inner_join(new, lookup, by = c("ccPerson", "last4ccnum.y"))
xdata <- arrange(xdata, ccPerson)
A single employee can carry multiple credit cards and loyalty cards.
# Attach the inferred person id to every transaction. dplyr::select is
# spelled out because raster (attached after dplyr) masks select(); the
# duplicated `timestamp` in the original selection list is removed (a
# duplicate name in select() kept only one column anyway).
ccLoyalty_merge <- left_join(ccLoyalty, xdata, by=c("last4ccnum"="last4ccnum.y")) %>%
  dplyr::select(ccPerson, timestamp, weekday, date, time, location, price, last4ccnum, loyaltynum.x, hr) %>%
  arrange(ccPerson) %>%
  distinct()
# reassign an id to be in running order
# NOTE: group_indices() is deprecated in recent dplyr; kept for
# consistency with the earlier ccPerson assignment.
ccLoyalty_merge$personId <- ccLoyalty_merge %>% group_indices(ccPerson)
ccLoyalty_merge
ccLoyalty_merge
# A tibble: 1,496 x 11
# Groups: last4ccnum [55]
ccPerson timestamp weekday date time location
<int> <dttm> <ord> <date> <chr> <chr>
1 1 2014-01-06 08:16:00 Mon 2014-01-06 08:16 Brew've Been…
2 1 2014-01-06 12:00:00 Mon 2014-01-06 12:00 Jack's Magic…
3 1 2014-01-06 13:27:00 Mon 2014-01-06 13:27 Abila Zacharo
4 1 2014-01-06 19:50:00 Mon 2014-01-06 19:50 Frydos Autos…
5 1 2014-01-07 07:54:00 Mon 2014-01-06 07:54 Brew've Been…
6 1 2014-01-07 12:00:00 Tue 2014-01-07 12:00 Jack's Magic…
7 1 2014-01-07 13:24:00 Tue 2014-01-07 13:24 Kalami Kafen…
8 1 2014-01-07 20:15:00 Tue 2014-01-07 20:15 Ouzeri Elian
9 1 2014-01-08 08:16:00 Wed 2014-01-08 08:16 Brew've Been…
10 1 2014-01-08 12:00:00 Wed 2014-01-08 12:00 Jack's Magic…
# … with 1,486 more rows, and 5 more variables: price <dbl>,
# last4ccnum <chr>, loyaltynum.x <chr>, hr <chr>, personId <int>
# Vehicle GPS pings. Timestamp arrives as "m/d/Y H:M:S" text; derive the
# helper columns in one mutate chain instead of repeated `$` assignments.
# NOTE(review): date/time come from mdy_hms() (UTC by default) while
# Timestamp is re-parsed with zone = "" (session-local time) — kept as in
# the original to preserve results, but the two parses could disagree
# across time zones; confirm intent.
gps <- read_csv("MC2/gps.csv") %>%
  mutate(date = as.Date(mdy_hms(Timestamp)), time = format(mdy_hms(Timestamp), "%H:%M")) %>%
  mutate(
    Timestamp = date_time_parse(Timestamp, zone = "", format = "%m/%d/%Y %H:%M:%S"),
    # hr uses the freshly parsed Timestamp (mutate evaluates sequentially).
    hr = strftime(Timestamp, "%H"),
    id = as_factor(id),
    weekday = wday(date, label = TRUE, abbr = TRUE)
  )
gps
# A tibble: 685,169 x 8
Timestamp id lat long date time hr
<dttm> <fct> <dbl> <dbl> <date> <chr> <chr>
1 2014-01-06 06:28:01 35 36.1 24.9 2014-01-06 06:28 06
2 2014-01-06 06:28:01 35 36.1 24.9 2014-01-06 06:28 06
3 2014-01-06 06:28:03 35 36.1 24.9 2014-01-06 06:28 06
4 2014-01-06 06:28:05 35 36.1 24.9 2014-01-06 06:28 06
5 2014-01-06 06:28:06 35 36.1 24.9 2014-01-06 06:28 06
6 2014-01-06 06:28:07 35 36.1 24.9 2014-01-06 06:28 06
7 2014-01-06 06:28:09 35 36.1 24.9 2014-01-06 06:28 06
8 2014-01-06 06:28:10 35 36.1 24.9 2014-01-06 06:28 06
9 2014-01-06 06:28:11 35 36.1 24.9 2014-01-06 06:28 06
10 2014-01-06 06:28:12 35 36.1 24.9 2014-01-06 06:28 06
# … with 685,159 more rows, and 1 more variable: weekday <ord>
I am eliminating coordinates that indicate the car is in motion. The GPS coordinates are recorded every 1–5 seconds, so a gap of more than 5 minutes between consecutive records means the employee has driven the car to a destination and stopped there. This filters out traffic-light stops and in-motion data.
For each employee: 1. get the first and last car coordinate of each day; 2. identify places of interest throughout the day.
# Stop-over detection: keep only pings followed by a gap of more than
# 5 minutes (plus each car's first and last ping), i.e. likely
# destinations rather than traffic stops.
ts <- gps %>%
  group_by(id) %>%
  # .by_group (leading dot) is the documented arrange() argument; the
  # original `by_group = TRUE` was silently treated as an extra constant
  # sort key. Within-group row order is the same either way.
  arrange(date, time, .by_group = TRUE) %>%
  # diff = minutes until the next ping in the same car (0 for the last
  # ping). The stray `, 2)` in the original created a junk mutate column
  # named "2" that the later select dropped; removed here.
  # NOTE(review): the author may have intended round(..., 2); kept at the
  # default 0 digits to reproduce the published results.
  mutate(diff = round(c(difftime(tail(Timestamp, -1), head(Timestamp, -1), units = "mins"), 0))) %>%
  # Flag each car's first and last records so trip endpoints are retained.
  mutate(count = seq_len(n()), FIRST = count == 1, LAST = count == max(count)) %>%
  filter(diff > 5 | FIRST | LAST) %>%
  arrange(id) %>%
  # dplyr::select spelled out: raster (attached after dplyr) masks select().
  dplyr::select(id, lat, long, date, time, diff, hr, weekday, Timestamp)
ts
# A tibble: 3,133 x 9
# Groups: id [40]
id lat long date time diff hr weekday
<fct> <dbl> <dbl> <date> <chr> <drtn> <chr> <ord>
1 1 36.1 24.9 2014-01-06 07:20 0 mins 07 Mon
2 1 36.1 24.9 2014-01-06 07:22 35 mins 07 Mon
3 1 36.0 24.9 2014-01-06 08:04 253 mins 08 Mon
4 1 36.1 24.9 2014-01-06 12:26 59 mins 12 Mon
5 1 36.0 24.9 2014-01-06 13:34 250 mins 13 Mon
6 1 36.1 24.9 2014-01-06 17:48 108 mins 17 Mon
7 1 36.1 24.9 2014-01-06 19:42 7 mins 19 Mon
8 1 36.1 24.9 2014-01-06 19:49 38 mins 19 Mon
9 1 36.1 24.9 2014-01-06 20:33 98 mins 20 Mon
10 1 36.0 24.9 2014-01-06 22:15 46 mins 22 Mon
# … with 3,123 more rows, and 1 more variable: Timestamp <dttm>
OGR data source with driver: ESRI Shapefile
Source: "/Users/yuntinghong/Documents/SMU/ISSS608 - Visual Analytics/hongyunting/YTBlog_ISSS608/_posts/2021-07-20-assignment-mc2/MC2/Geospatial", layer: "Abila"
with 3290 features
It has 9 fields
Integer64 fields read as strings: TLID
Using just the credit and loyalty card data, identify the most popular locations, and when they are popular. What anomalies do you see? What corrections would you recommend to correct these anomalies?
| Observation | Supporting Evidence |
|---|---|
| 1. Figure 1 shows that “Katerina’s Cafe” is identified as the popular location as it has the highest number of transaction made within these 2 weeks, followed by “Hippokampos” and “Guy’s Gyros”. | ![]() |
Add the vehicle data to your analysis of the credit and loyalty card data. How does your assessment of the anomalies in question 1 change based on this new data? What discrepancies between vehicle, credit, and loyalty card data do you find?
# Convert the stop-over table into an sf POINT layer (WGS 84 / EPSG:4326)
# so it can be overlaid on the tmap basemap.
gps_sf_path <- ts %>%
  st_as_sf(coords = c("long", "lat"), crs = 4326) %>%
  st_cast("POINT")
# Map view of Abila, Kronos's route and Employees' whereabout
# Interactive map: raster basemap + Abila road network + one dot per
# detected stop-over, coloured by car id, with a popup showing when and
# for how long the car stayed there.
# NOTE(review): bgmap and Abila_st are created in an earlier (unshown)
# chunk — bgmap is presumably the georeferenced tourist-map raster and
# Abila_st the shapefile whose readOGR output appears above; confirm.
tmap_mode("view")
tm_shape(bgmap) +
tm_rgb(bgmap, r = 1, g = 2, b = 3, # setting red to band 1, green to band 2, blue to band 3
alpha = NA,
saturation = 1,
interpolate = TRUE,
max.value = 255) +
# 44 cars exceed tmap's default category limit, so raise it.
tmap_options(max.categories = 44) +
tm_shape(Abila_st) +
tm_lines() +
tm_shape(gps_sf_path) +
tm_dots(col ="id",
popup.vars=c("Date:"="date", "Time:"="time", "Day of Week:"="weekday", "Stopover duration (mins):"="diff"))
# A tibble: 3,133 x 9
# Groups: id [40]
id lat long date time diff hr weekday
<fct> <dbl> <dbl> <date> <chr> <drtn> <chr> <ord>
1 1 36.1 24.9 2014-01-06 07:20 0 mins 07 Mon
2 1 36.1 24.9 2014-01-06 07:22 35 mins 07 Mon
3 1 36.0 24.9 2014-01-06 08:04 253 mins 08 Mon
4 1 36.1 24.9 2014-01-06 12:26 59 mins 12 Mon
5 1 36.0 24.9 2014-01-06 13:34 250 mins 13 Mon
6 1 36.1 24.9 2014-01-06 17:48 108 mins 17 Mon
7 1 36.1 24.9 2014-01-06 19:42 7 mins 19 Mon
8 1 36.1 24.9 2014-01-06 19:49 38 mins 19 Mon
9 1 36.1 24.9 2014-01-06 20:33 98 mins 20 Mon
10 1 36.0 24.9 2014-01-06 22:15 46 mins 22 Mon
# … with 3,123 more rows, and 1 more variable: Timestamp <dttm>

